%pylab inline
import pandas as pd
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
from os import listdir
from os.path import isfile, join
import pickle
import itertools
import statsmodels.formula.api as smf
import statsmodels.tsa.stattools
import warnings
from itertools import product
warnings.filterwarnings('ignore')
import statsmodels
statsmodels.__version__
def invboxcox(y,lmbda):
if lmbda == 0:
return(np.exp(y))
else:
return(np.exp(np.log(lmbda*y+1)/lmbda))
mypath = "H:/Yandex machine learning/finall course coursera/csv_yellow_cab_2015/reg_bin_stat/"
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
print(onlyfiles)
ESB_reg = 1231
month = []
for file in onlyfiles[1:] + [onlyfiles[0]]:
with open(mypath + file, "rb") as f:
region_bined_stat = pickle.load(f)
month.append(region_bined_stat[ESB_reg-1, :])
for i in month:
print(len(i), end=" ")
month_join = list(itertools.chain.from_iterable(month))
time = pd.date_range('2015 Jan 1 00:00:00', periods = len(month_join), freq = 'h')
taxi_data = pd.DataFrame({"taxi_call_num" : month_join}, index = [time])
taxi_data.head()
print(taxi_data.index.values.shape)
print(taxi_data.taxi_call_num.values.shape)
fig, axes = plt.subplots(figsize = (80, 8))
axes.title.set_size(40)
taxi_data.plot(color="green", title="timeserie", fontsize=25, ax = axes)
frequency = 24
plt.xticks(time[::24])
plt.grid()
#double click on image makes it larger
#num of hours in each month
month_len = np.array([0, 744, 672, 744, 720, 744, 720])
for i, c in enumerate(["January", "February", "March"]):
taxi_data[int(sum(month_len[:i+1])) : int(sum(month_len[:i+2]))].plot(figsize=(20, 5), title=c, color="green", fontsize=16, grid=True)
plt.figure(figsize(60,10))
sm.tsa.seasonal_decompose(taxi_data.taxi_call_num).plot()
print("Dickey-Fuller stationarity test: p=%f" % sm.tsa.stattools.adfuller(taxi_data.taxi_call_num)[1])
#double click on image makes it larger
plt.figure(figsize(15,8))
sm.tsa.seasonal_decompose(taxi_data.taxi_call_num[0:744]).plot()
print("Dickey-Fuller stationarity test: p=%f" % sm.tsa.stattools.adfuller(taxi_data.taxi_call_num[0:744])[1])
plt.figure(figsize=(15,4))
plt.plot(taxi_data.index, taxi_data.taxi_call_num)
""" moving average """
plt.plot(taxi_data.index, taxi_data.taxi_call_num.rolling(window=24).mean(), 'r')
T = 168
t = 28
time = np.arange(0, taxi_data.shape[0])
regression_taxi = taxi_data.copy()
#regression_taxi.loc[regression_taxi["taxi_call_num"] == 0] = 1
#regression_taxi['log_taxi_call_num'], lmbda = stats.boxcox(regression_taxi.taxi_call_num)
#t=28
#tt=48
#ttt = 14
for w in range(1, 100):
regression_taxi["sin_sin_w_" + str(w)] = np.sin(2*np.pi*w*time/T) * np.sin(2*np.pi*w*time/t)
regression_taxi["cos_cos_w_" + str(w)] = np.cos(2*np.pi*w*time/T) * np.cos(2*np.pi*w*time/t)
regression_taxi["sin_cos_w_" + str(w)] = np.sin(2*np.pi*w*time/T) * np.cos(2*np.pi*w*time/t)
regression_taxi["cos_sin_w_" + str(w)] = np.cos(2*np.pi*w*time/T) * np.sin(2*np.pi*w*time/t)
regression_taxi.head()
regression_taxi.shape
models = []
for w in range(1, 100):
m1 = smf.ols('taxi_call_num ~ ' + ' + '.join(list(regression_taxi.columns)[1:4*w+1]), data=regression_taxi)
model = m1.fit(cov_type='HC1')
models.append(model)
# fitted.summary(); fitted.params; models[-1].rsquared; params.Intercept
import warnings
warnings.filterwarnings('ignore')
fig1 = figure(figsize=(15,5))
ax1 = fig1.add_subplot(111)
line1 = ax1.plot(range(1, 100), list(map(lambda x: x.rsquared, models)), color="blue", label="r-square")
ylabel("r-squared")
ax2 = fig1.add_subplot(111, sharex=ax1, frameon=False)
line2 = ax2.plot(range(1, 100), list(map(lambda x: x.mse_resid, models)), color="green", label="resid")
ax2.yaxis.tick_right()
ax2.yaxis.set_label_position("right")
ylabel("mse_resid")
legend((line1, line2), ("1", "2"))
plt.xlabel("number of fourier components")
legend()
grid()
show()
statsmodels.tsa.stattools.acf(x=models[3].resid, nlags=168)[168]
plt.figure(figsize=(15,6))
plt.plot(range(1, 100), [statsmodels.tsa.stattools.acf(x=model.resid, nlags=168)[168] for model in models], color="blue", label='week')
plt.plot(range(1, 100), [statsmodels.tsa.stattools.acf(x=model.resid, nlags=168)[24] for model in models], color="green", label='day')
plt.plot(range(1, 100), [statsmodels.tsa.stattools.acf(x=model.resid, nlags=168)[14] for model in models], color="red", label='14 hours')
plt.legend()
plt.grid()
plt.xlabel("number of fourier components")
plt.ylabel("autocorellation with lag")
models[80].summary()
def prediction(model, components, feature_num, dataframe):
coefs = np.array([model.params[coef] for coef in model.params.index])
fourier_components = np.hstack((np.array([1]*dataframe.shape[0]).reshape(dataframe.shape[0],1),
dataframe.iloc[:, 1:feature_num*components+1].values))
return coefs.dot(fourier_components.T)
#model = models[40]
pred = prediction(models[80], components=81, feature_num=4, dataframe=regression_taxi)
def prediction_real(df_1, df_2, segment, title, ylabel, size, labels):
plt.figure(figsize=size)
plt.plot(df_1[segment[0]: segment[1]], alpha = 0.7, label = labels[0])
plt.plot(df_2[segment[0]: segment[1]], alpha=0.5, label = labels[1], color="red")
plt.xlabel('time (hours)')
plt.ylabel(ylabel)
plt.title(title)
plt.legend()
plt.grid()
prediction_real(pred, regression_taxi.taxi_call_num.values, (0, 745), "January", 'taxi calls', (15, 5), ("predict", "real"))
prediction_real(pred, regression_taxi.taxi_call_num.values, (0, 4344), "six months", 'taxi calls', (50, 5), ("predict", "real"))
#double click makes plot langer
prediction_real(regression_taxi.taxi_call_num.values, models[80].resid.values, (0, 745), "first january week", 'residuals',
(30, 5), ("real values", "residuals"))
prediction_real(regression_taxi.taxi_call_num.values, models[80].resid.values, (0, 168), "first january week", 'residuals',
(15, 5), ("real values", "residuals"))
plt.figure(figsize(80,10))
sm.tsa.seasonal_decompose(models[80].resid).plot()
print("Критерий Дики-Фуллера: p=%f" % sm.tsa.stattools.adfuller(models[80].resid)[1])
#sm.tsa.seasonal_decompose(models[80].resid).__dict__.keys()
#plt.plot(sm.tsa.seasonal_decompose(models[80].resid).__dict__["resid"][:96])
regression_taxi.iloc[:, :321].head()
full_regres_taxi = regression_taxi.iloc[:, :321]
full_regres_taxi.shape
#full_regres_taxi = full_regres_taxi.drop(['jan_anom'], axis=1)
full_regres_taxi['jan_anom'] = [full_regres_taxi.loc[date].taxi_call_num if date.month == 1 and date.day in [27] and date.year == 2015 else 0 for date in full_regres_taxi.index]
full_regres_taxi.shape
m1 = smf.ols('taxi_call_num ~ ' + ' + '.join(list(full_regres_taxi.columns)[1:]) + ' + jan_anom', data=full_regres_taxi)
fin_model_anomal = m1.fit(cov_type='HC1')
#result = prediction(fin_model_anomal, components=81, feature_num=4, dataframe=regression_taxi)
coefs = np.array([fin_model_anomal.params[coef] for coef in fin_model_anomal.params.index])
fourier_components = np.hstack((np.array([1]*full_regres_taxi.shape[0]).reshape(full_regres_taxi.shape[0],1),
full_regres_taxi.iloc[:, 1:4*80+2].values))
result = coefs.dot(fourier_components.T)
prediction_real(regression_taxi.taxi_call_num.values, result, (744, 1416), "february", 'taxi calls',
(15, 5), ("real values", "prediction"))
def plot_autocorr(resid_data):
plt.figure(figsize(15,10))
ax = plt.subplot(211)
sm.graphics.tsa.plot_acf(resid_data.squeeze(), lags=168, ax=ax)
pylab.show()
ax = plt.subplot(212)
sm.graphics.tsa.plot_pacf(resid_data.squeeze(), lags=168, ax=ax)
pylab.show()
plot_autocorr(fin_model_anomal.resid.values)
full_regres_taxi["residuals"] = fin_model_anomal.resid
full_regres_taxi["residual_diff1"] = full_regres_taxi["residuals"] - full_regres_taxi["residuals"].shift(1)
plot_autocorr(full_regres_taxi["residual_diff1"][1:].values)
ps = range(0, 12)
qs = range(0, 6)
Ps = range(0, 2)
Qs = range(0, 2)
parameters = product(ps, qs, Ps, Qs)
parameters_list = list(parameters)
len(parameters_list)
%%time
residuals_results = []
residuals_best_aic = float("inf")
warnings.filterwarnings('ignore')
resid_wrong_param = 0
for counter, param in enumerate(parameters_list):
if counter % 10 == 0:
print(counter, end=' ')
#some of the parameters set are incompatible
try:
model=sm.tsa.statespace.SARIMAX(full_regres_taxi["residuals"], order=(param[0], 1, param[1]),
seasonal_order=(param[2], 0, param[3], 24)).fit(disp=-1)
#print failure parameters and continue
except LinAlgError:
print("singular matrix: ", param)
continue
except ValueError:
resid_wrong_param += 1
continue
aic = model.aic
#save best model, aic, parameters
if aic < residuals_best_aic:
residuals_best_model = model
residuals_best_aic = aic
residuals_best_param = param
residuals_results.append([param, model.aic])
warnings.filterwarnings('default')
print(sorted(residuals_results, key=lambda tup: tup[1], reverse=False)[:5])
plt.figure(figsize(15,12))
ax = plt.subplot(211)
sm.graphics.tsa.plot_acf(residuals_best_model.resid.values.squeeze(), lags=168, ax=ax)
pylab.show()
ax = plt.subplot(212)
sm.graphics.tsa.plot_pacf(residuals_best_model.resid.values.squeeze(), lags=168, ax=ax)
pylab.show()
best_arima = sm.tsa.statespace.SARIMAX(full_regres_taxi["residuals"], order=(11, 1, 1), seasonal_order=(1, 0, 1, 24)).fit(disp=-1)
plt.figure(figsize(15,8))
plt.subplot(211)
residuals_best_model.resid[1:].plot()
plt.ylabel(u'Residuals')
ax = plt.subplot(212)
sm.graphics.tsa.plot_acf(residuals_best_model.resid[1:].values.squeeze(), lags=168, ax=ax)
print("Критерий Стьюдента: p=%f" % stats.ttest_1samp(residuals_best_model.resid[1:], 0)[1])
print("Критерий Дики-Фуллера: p=%f" % sm.tsa.stattools.adfuller(residuals_best_model.resid[1:])[1])
prediction_real(full_regres_taxi["residuals"], best_arima.fittedvalues, (744, 1416), "february", 'taxi calls',
(15, 5), ("real residuals", "arima fitted values"))
prediction_real(full_regres_taxi["residuals"], best_arima.fittedvalues, (3624, 4344), "june", 'taxi calls',
(15, 5), ("real residuals", "arima fitted values"))
best_arima.fittedvalues.shape
final_res = result + best_arima.fittedvalues
prediction_real(regression_taxi.taxi_call_num.values, final_res.values, (3624, 4344), "june", 'taxi calls',
(15, 5), ("real taxi calls", "final prediction"))
prediction_real(regression_taxi.taxi_call_num.values, final_res.values, (744, 1416), "february", 'taxi calls',
(15, 5), ("real taxi calls", "final prediction"))
diff_data=pd.DataFrame(taxi_data.taxi_call_num, columns=["taxi_data.taxi_call_num"])
plot_autocorr(taxi_data.taxi_call_num.values)
diff_data["diff_168"] = taxi_data.taxi_call_num - taxi_data.taxi_call_num.shift(168)
plot_autocorr(diff_data["diff_168"][168:].values)
diff_data["diff_168_1"] = diff_data["diff_168"][168:] - diff_data["diff_168"][168:].shift(1)
plot_autocorr(diff_data["diff_168_1"][169:].values)
ps = range(0, 12)
qs = range(0, 6)
Ps = range(0, 2)
Qs = range(0, 2)
parameters = product(ps, qs, Ps, Qs)
parameters_list = list(parameters)
len(parameters_list)
%%time
results = []
best_aic = float("inf")
warnings.filterwarnings('ignore')
wrong_parameters_counter = 0
for counter, param in enumerate(parameters_list):
if counter % 10 == 0:
print(counter, end=' ')
#some of the parameters set are incompatible
try:
model=sm.tsa.statespace.SARIMAX(taxi_data.taxi_call_num, order=(param[0], 1, param[1]),
seasonal_order=(param[2], 1, param[3], 24)).fit(disp=-1)
#print failure parameters and continue
except LinAlgError:
print("singular matrix: ", param)
continue
except ValueError:
wrong_parameters_counter += 1
continue
aic = model.aic
#save best model, aic, parameters
if aic < best_aic:
best_model = model
best_aic = aic
best_param = param
results.append([param, model.aic])
warnings.filterwarnings('default')
print(wrong_parameters_counter)
print(sorted(results, key=lambda tup: tup[1], reverse=False)[:5])
plt.figure(figsize(15,12))
ax = plt.subplot(211)
sm.graphics.tsa.plot_acf(best_model.resid.values.squeeze(), lags=168, ax=ax)
pylab.show()
ax = plt.subplot(212)
sm.graphics.tsa.plot_pacf(best_model.resid.values.squeeze(), lags=168, ax=ax)
pylab.show()
plt.figure(figsize(15,8))
plt.subplot(211)
best_model.resid[1:].plot()
plt.ylabel(u'Residuals')
ax = plt.subplot(212)
sm.graphics.tsa.plot_acf(best_model.resid[1:].values.squeeze(), lags=168, ax=ax)
print("Критерий Стьюдента: p=%f" % stats.ttest_1samp(best_model.resid[1:], 0)[1])
print("Критерий Дики-Фуллера: p=%f" % sm.tsa.stattools.adfuller(best_model.resid[1:])[1])
Согласно критерию Стьюдента - остатки не смещены
best_arima_1 = sm.tsa.statespace.SARIMAX(taxi_data.taxi_call_num, order=(9, 1, 1),
seasonal_order=(1, 1, 1, 24)).fit(disp=-1)
prediction_real(regression_taxi.taxi_call_num.values, best_arima_1.fittedvalues.values, (3624, 4344), "june", 'taxi calls',
(15, 5), ("real taxi calls", "final SARIMA fitted"))
prediction_real(regression_taxi.taxi_call_num.values, best_arima_1.fittedvalues.values, (744, 1416), "february", 'taxi calls',
(15, 5), ("real taxi calls", "final SARIMA fitted"))